Libraries

library(librarian)
librarian::shelf(dplyr, stringr, checkmate,
                 Seurat, SeuratDisk,
                 ggplot2, gridExtra, cowplot,
                 quiet = TRUE)
if(!exists("databringr", mode="function")) source("utils.R")

Set working directory:

if (Sys.getenv("RSTUDIO")==1) {
   # setwd to where the editor is, if the IDE is RStudio
  setwd( dirname(rstudioapi::getSourceEditorContext(id = NULL)$path) )
} else {
  # setwd to where the editor is in a general way - maybe less failsafe than the previous
  setwd(here::here())
  # the following checks that the latter went well, but assuming
  # that the user has not changed the name of the repo
  d <- str_split(getwd(), fsep)[[1]][length(str_split(getwd(), fsep)[[1]])]
  if (d != 'Puigetal2023_bioinformatics_scripts') { stop(
    paste0("Could not set working directory automatically to where this",
           " script resides.\nPlease do `setwd()` manually"))
    }
}

To save images outside the repo (to reduce size):

figdir <- paste0(c(head(str_split(getwd(), fsep)[[1]],-1),
                   paste0(tail(str_split(getwd(), fsep)[[1]],1), '_figures')),
                 collapse = fsep)
dir.create(figdir, showWarnings = FALSE)

1 Single-cell RNAseq data from Hung et al., 2020 (data1)

These data come from Hung RJ, Hu Y, Kirchner R, et al. A cell atlas of the adult Drosophila midgut. Proc Natl Acad Sci U S A. 2020;117(3):1514-1523. doi:[10.1073/pnas.1916820117](https://doi.org/10.1073/pnas.1916820117).

The processed data are available at the GEO with accession number GSE120537. However, the data were generated using 10x and inDrop technologies, and the supplementary files at GEO do not contain the fully integrated assay and clusters. Integration could be performed using Hung et al. (2020) published pipeline, but this can be somewhat machine-dependent (see e.g. here). Therefore, we will use the fully integrated dataset generated by Hung et al., kindly provided by Dr Claire Yanhui Hu.

As the dataset has already been integrated, let’s select the raw data as the default assay. Let’s add one more metadata to this set of cells indicating that they come from ‘dataset1’. Then we remove the mitochondrial and ribosomal genes and separate the dataset by technology, obtaining the two samples.

# get the dataset
inpath <- file.path(getwd(), 'input')
if (!file.exists( file.path(inpath, 'integrated_Hung2020.rds') )) {
databringr(from = 'http://genomics.essex.ac.uk/flygut/integrated_Hung2020.rds',
           to = file.path(inpath, 'integrated_Hung2020.rds'))
}

# load data
data1 <- readRDS(file.path("input", "integrated_Hung2020.rds"))
as.data.frame.table(table((data1)$technology)) %>%
  dplyr::rename(celltype=Var1, ncells=Freq) %>%
  knitr::kable()
celltype ncells
10x 2979
inDrop 7626
# check mitochondrial/ribosomal genes
DefaultAssay(data1) <- "RNA"
data1[["percent.mt"]] <- PercentageFeatureSet(data1, pattern = '^mt:+')
data1[["percent.ribo"]] <- PercentageFeatureSet(data1, pattern = '^Rp[SL]')
VlnPlot(data1, features = c("nFeature_RNA", "nCount_RNA", "percent.mt",
                              "percent.ribo"))

DimPlot(data1, reduction = "umap", group.by = "integrated_celltype") 

# quick check at present cell types
as.data.frame.table(table((data1)$integrated_celltype)) %>%
  dplyr::rename(celltype=Var1, ncells=Freq) %>%
  knitr::kable()
celltype ncells
0 2213
10 336
16 237
19 89
2 806
20 54
21 36
9 394
aEC 1828
cardia 225
EE 849
iron/copper 559
ISC/EB 799
LFC 549
mEC 449
pEC 1182
# add dataset label
data1@meta.data[["data"]] <- "data1"
data1@meta.data[["celltype"]] <- data1@meta.data[["integrated_celltype"]] 
# remove ribosomal and mitochondrial genes
data1 <- data1[ ! grepl('^Rp[SL]', rownames(data1)), ]
data1 <- data1[ ! grepl('^mt:+', rownames(data1)), ]
# divide cells by scRNAseq technology
object_1 <- SplitObject(data1, split.by = "technology")
rm(data1)
tenx_1 <- object_1[["10x"]]
indrop <- object_1[["inDrop"]]
rm('object_1')

2 Single-cell RNAseq data from Li et al., 2022 (data2)

These data come from Li H, Janssens J, De Waegeneer M, et al. Fly Cell Atlas: A single-nucleus transcriptomic atlas of the adult fruit fly. Science. 2022;375(6584):eabk2432. doi:[10.1126/science.abk2432](https://doi.org/10.1126/science.abk2432).

Links to integrated 10x and Smart-seq2 intestine data are available from the Fly Cell Atlas website. The data is in h5ad format, so we need to convert it to a readable format for Seurat. We did the same steps as the previous data set and separated the data by technology and sex, obtaining individual samples.

if(file.exists( file.path(inpath, 'integrated_Li2022.rds') )) {
  data2 <- readRDS( file.path(inpath, 'integrated_Li2022.rds') )
} else {
  if (!file.exists( file.path(getwd(), 'input', 'data2.h5seurat') )) {
    databringr(from = 'https://cloud.flycellatlas.org/index.php/s/8kHHzDSo48FLjTm/download/gut.h5ad',
               to = file.path(inpath, 'data2.h5ad'))
    Convert((file.path(inpath, "data2.h5ad")), 
             file.path(inpath, "data2.h5seurat"))
  }
  # load data, show cell types by tech and sex
  data2 <- LoadH5Seurat(file.path("input", "data2.h5seurat"))
  DefaultAssay(data2) <- "RNA"
  # remove ribosomal/mitochondrial genes
  data2 <- data2[ ! grepl('^Rp[SL]', rownames(data2)), ]
  data2 <- data2[ ! grepl('^mt:+', rownames(data2)), ]
  # add dataset label
  data2@meta.data[["data"]] <- "data2"
  saveRDS(data2, file.path(file.path(inpath, 'integrated_Li2022.rds')))
}
# visualisation: number of cells per tech/sex/type and UMAP
as.data.frame.table(table((data2)$technology)) %>%
  dplyr::rename(celltype=Var1, ncells=Freq) %>%
  knitr::kable()
celltype ncells
10x 11788
ss2 594
as.data.frame.table(table((data2)$sex)) %>%
  dplyr::rename(celltype=Var1, ncells=Freq) %>%
  knitr::kable()
celltype ncells
female 6640
male 5742
DimPlot(data2, reduction = "umap", group.by = "annotation",
        label = TRUE, label.size = 3, repel = TRUE) + NoLegend()

as.data.frame.table(table((data2)$annotation)) %>%
  dplyr::rename(celltype=Var1, ncells=Freq) %>%
  knitr::kable()
celltype ncells
adult Malpighian tubule 38
adult differentiating enterocyte 299
adult fat body 15
adult midgut enterocyte 282
adult midgut-hindgut hybrid zone 294
adult pylorus 191
antimicrobial peptide-producing cell 304
cardia (1) 1150
cardia (2) 318
copper cell 95
crop 4715
enteroblast 225
enterocyte of anterior adult midgut epithelium 779
enterocyte of posterior adult midgut epithelium 580
enterocyte-like 1027
enteroendocrine cell 104
intestinal stem cell 305
male accessory gland 56
midgut large flat cell 264
muscle cell 100
unknown 760
visceral muscle of the crop 205
visceral muscle of the midgut 276

Prepare for integration by splitting cells according to sex and technology.

# divide cells by scRNAseq technology and sex
object_2 <- SplitObject(data2, split.by = "technology")
rm(data2)
ss2 <- object_2[["ss2"]]
ss2_split <- SplitObject(ss2, split.by = "sex")
ss2_split[["male"]]@meta.data[["sample"]] <- "ss2_1"
ss2_split[["female"]]@meta.data[["sample"]] <- "ss2_2"
ss2 <- merge(ss2_split[["male"]], c(ss2_split[["female"]]),
                 add.cell.ids = c("male","female"))
tenx_2 <- object_2[["10x"]]
tenx_2_split <- SplitObject(tenx_2, split.by = "sex")
tenx_2_split[["male"]]@meta.data[["sample"]] <- "10x_3"
tenx_2_split[["female"]]@meta.data[["sample"]] <- "10x_4"
tenx_2 <- merge(tenx_2_split[["male"]], c(tenx_2_split[["female"]]),
             add.cell.ids = c("male","female"))
rm('object_2', 'tenx_2_split', 'ss2_split')

3 Merge and visualise

We merged the two datasets. We scaled the data and did the dimensionality reduction before integration. We can see that the datasets have batch effects coming from technology and experiment. We must then proceed to data integration.

outpath <- file.path(getwd(), 'output')
if (!file.exists( file.path(outpath, 'non_integrated_merged.rds') )) {
  alldata <- merge(indrop, c(ss2,tenx_1,tenx_2),
                   add.cell.ids = c("indrop","ss2","tenx_1","tenx_2"))
  rm('indrop', 'ss2', 'tenx_1', 'tenx_2')
  # variable features
  alldata <- FindVariableFeatures(alldata, nfeatures = 2000)
  top20 <- head(VariableFeatures(alldata), 20)
  LabelPoints(plot = VariableFeaturePlot(alldata), points = top20, repel = TRUE)
  # scaling and dimensionality reduction
  alldata <- ScaleData(alldata, verbose = FALSE)
  alldata <- RunPCA(alldata, npcs = 30, verbose = FALSE)
  alldata <- RunUMAP(alldata, dims = 1:30, verbose = FALSE)
  # cache
  if (!file.exists( outpath )) dir.create( outpath )
  saveRDS(alldata, file.path(outpath, 'non_integrated_merged.rds'))
} else {
  alldata <- readRDS(file.path(outpath, 'non_integrated_merged.rds'))
}
# check batch effects by sample/tech
p1 <- DimPlot(alldata, reduction = "umap", group.by = "data") 
p1

# segregation by technology/experiment
p2 <- DimPlot(alldata, reduction = "umap", group.by = "sample")
p2

4 Data integration and re-scaling

We use the Seurat::IntegrateData function to integrate based on sample (independent experiments by technology). The algorithm identifies anchors, which are pairs of cross-datasets of cells that are in a compatible biological state, correcting for batch effects.

if (file.exists( file.path(outpath, 'alldata-int-cache1.rds') )) {
  alldata.int <- readRDS( file.path(outpath, 'alldata-int-cache1.rds') )
} else {
  # find anchor candidates per sample
  alldata.list <- SplitObject(alldata, split.by = "sample")
  rm(alldata)
  for (i in 1:length(alldata.list)) {
    alldata.list[[i]] <- ScaleData(alldata.list[[i]], verbose = FALSE)
    alldata.list[[i]] <- NormalizeData(alldata.list[[i]], verbose = FALSE)
    alldata.list[[i]] <- FindVariableFeatures(alldata.list[[i]], selection.method = "vst",
                                              nfeatures = 2000, verbose = FALSE)}
  # identify anchors and integrate
  alldata.anchors <- FindIntegrationAnchors(alldata.list, dims = 1:30, verbose = FALSE)
  to_integrate <- Reduce(intersect, lapply(alldata.anchors@object.list, rownames))
  # this step is quite intensive
  alldata.int <- IntegrateData(alldata.anchors, dims = 1:30, new.assay.name = "CCA", 
                               features.to.integrate = to_integrate,
                               verbose = FALSE)
  # manual cache for knitting the RMD into html
  saveRDS(alldata.int, file.path(outpath, 'alldata-int-cache1.rds'))
}

Now we can scale the integrated data, perform dimensionality reduction by PCA (retaining the 30 first principal components) and visualise with UMAP and tSNE. With this, we can confirm that cells were in compatible biological states. However, due to the presence of different cell types, we observed differences in the clustering patterns.

if (file.exists( file.path(outpath, 'alldata-int-cache2.rds') )) {
  alldata.int <- readRDS( file.path(outpath, 'alldata-int-cache2.rds') )
} else {
  # rescale & PCA
  alldata.int <- ScaleData(alldata.int, verbose = FALSE)
  alldata.int <- RunPCA(alldata.int, npcs = 50, verbose = FALSE)
  # show
  DimPlot(alldata.int, reduction = "pca", group.by = "sample")
  DimHeatmap(alldata.int, dims = 1, cells = 500, balanced = TRUE)
  DimHeatmap(alldata.int, dims = 1:15, cells = 500, balanced = TRUE)
  ElbowPlot(alldata.int, ndims = 50)
  # 30 PCs is enough
  alldata.int <- RunUMAP(alldata.int, dims = 1:30, verbose = FALSE)
  alldata.int <- RunTSNE(alldata.int, dims = 1:30)
  saveRDS(alldata.int, file.path(outpath, 'alldata-int-cache2.rds'))
}

# cells from different datasets and experiments now mingle
p3 <- DimPlot(alldata.int, reduction = "umap", group.by = "data") 
p3

p4 <- DimPlot(alldata.int, reduction = "umap", group.by = "sample") 
p4

5 Cell type clustering with integrated dataset

From here, we can use the dimensionally-reduced, integrated data to group cells of the same cell type. We use the Seurat::FindNeighbors function to calculate the KNN and SNN graphs; FindClusters will detect clusters based on these graphs using the “Louvain” algorithm (at resolution = 1).

alldata.int <- FindNeighbors(alldata.int, dims = 1:30)
# explore resolution parameter (more resolution gives more clusters)
for (res in c(0.1, 0.25, 0.5, 0.6, 1, 1.5, 2)) {
  alldata.int <- FindClusters(alldata.int, resolution = res, algorithm = 1)
}
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 22987
## Number of edges: 941777
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9836
## Number of communities: 17
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 22987
## Number of edges: 941777
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9727
## Number of communities: 26
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 22987
## Number of edges: 941777
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9583
## Number of communities: 26
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 22987
## Number of edges: 941777
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9530
## Number of communities: 27
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 22987
## Number of edges: 941777
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9353
## Number of communities: 37
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 22987
## Number of edges: 941777
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9183
## Number of communities: 43
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 22987
## Number of edges: 941777
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9032
## Number of communities: 50
## Elapsed time: 3 seconds
plot_grid(ncol = 3,
  DimPlot(alldata.int, reduction = "umap", group.by = "CCA_snn_res.0.1") + 
    NoAxes() + ggtitle("louvain_0.1"),
  DimPlot(alldata.int, reduction = "umap", group.by = "CCA_snn_res.0.25") +
    NoAxes() + ggtitle("louvain_0.25"),
  DimPlot(alldata.int, reduction = "umap", group.by = "CCA_snn_res.0.5") +
    NoAxes() + ggtitle("louvain_0.5"),
  DimPlot(alldata.int, reduction = "umap", group.by = "CCA_snn_res.1") +
    NoAxes() + ggtitle("louvain_1"),
  DimPlot(alldata.int, reduction = "umap", group.by = "CCA_snn_res.1.5") +
    NoAxes() + ggtitle("louvain_1.5"),
  DimPlot(alldata.int, reduction = "umap", group.by = "CCA_snn_res.2") +
    NoAxes() + ggtitle("louvain_2")
)

# choose resolution = 1
alldata.int <- SetIdent(alldata.int, value = "CCA_snn_res.1")
table(alldata.int@active.ident)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 1379 1374 1309 1181 1082 1077 1035  938  906  870  815  779  765  681  653  652 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31 
##  649  624  577  569  554  444  443  428  391  355  341  325  293  283  269  259 
##   32   33   34   35   36 
##  251  231  114   58   33
DimPlot(alldata.int, reduction = "tsne", group.by = "CCA_snn_res.1")

DefaultAssay(alldata.int) <- "RNA"

Heatmap of known markers

To help with the annotation of clusters, we start with a heatmap of the markers described by Hung et al. (2020). The file cell_type_markers.csv was also provided by Dr Y Hu.

# read markers
markers <- readr::read_csv(file.path("resources","cell_type_markers.csv"), col_types = 'cc')
knitr::kable(markers)
cell_identity genes
ISC Dl
ISC Smvt
ISC CG10006
pISC sna
ISCR2R5 polo
ISC/EB esg
EB E(spl)m3-HLH
EB E(spl)malpha-BFM
EB E(spl)mbeta-HLH
EB klu
EE pros
EE 7B2
EE_subtype AstA
EE_subtype AstC
EE_subtype CCHa1
EE_subtype CCHa2
EE_subtype Tk
EE_subtype NPF
EE_subtype Orcokinin
EE_subtype Dh31
aEC nub
aEC Myo31DF
mEC nub
mEC Myo31DF
pEC nub
pEC Myo31DF
aEC betaTry
pEC lambdaTry
mEC lab
mEC Vha100-4
cardia Pgant4
copper PGRP-LA
copper PGRP-LC
copper Apoltp
VM Vn
LFC Ilp3
LFC PGRP-SC1a
LFC PGRP-SC1b
LFC PGRP-SD
iron PGRP-SC2
iron ZIP1(Zip42C.1)
# set integrated clustering as cell identity
Idents(alldata.int) <- alldata.int@meta.data$CCA_snn_res.1
# get genome-wide gene expression per cluster, and visualise for marker genes
cluster.averages <- AverageExpression(alldata.int, return.seurat=TRUE, add.ident="sample")
DoHeatmap(cluster.averages, features=head(markers, 20)$genes, size=2)

DoHeatmap(cluster.averages, features=tail(markers, 21)$genes, size=2)

Annotation of clusters

clusters <- alldata.int@meta.data$CCA_snn_res.1
# assign cell type to cluster
celltype <- case_when(
  clusters %in% c(0) ~ "enterocyte-like",
  clusters %in% c(1,8,9,11,12,33) ~ "crop",
  clusters %in% c(2) ~ "intestinal stem cell / enteroblast",
  clusters %in% c(3,5,6,14,19,20,23,36) ~ "enterocyte of anterior midgut epithelium",
  clusters %in% c(4,10,27) ~ "cardia",
  clusters %in% c(7) ~ "midgut large flat cell",
  clusters %in% c(13,16,21,31) ~ "enterocyte of posterior midgut epithelium",
  clusters %in% c(15,22,29) ~ "midgut enterocyte",
  clusters %in% c(17,28) ~ "enteroendocrine cell",
  clusters %in% c(18) ~ "muscle cell",
  clusters %in% c(24) ~ "differentiating enterocyte",
  clusters %in% c(25) ~ "antimicrobial peptide-producing cell",
  clusters %in% c(26) ~ "midgut-hindgut hybrid zone",
  clusters %in% c(30) ~ "pylorus",
  clusters %in% c(34) ~ "male accessory gland",
  clusters %in% c(35) ~ "malpighian tubule",
  clusters %in% c(32) ~ "unkown",
  TRUE ~ as.character(clusters))
# set integrated cell types as new cell identity
alldata.int@meta.data$integrated_celltype <- celltype
Idents(alldata.int) <- alldata.int@meta.data$integrated_celltype
# get genome-wide gene expression per cluster, and visualise
integratedcelltype.averages <- AverageExpression(alldata.int, return.seurat=TRUE)
DoHeatmap(integratedcelltype.averages, features=markers$genes, size=2)

DimPlot(alldata.int, group.by="integrated_celltype", label=TRUE)

DimPlot(alldata.int, group.by="integrated_celltype", label=TRUE, reduction = "pca")

DimPlot(alldata.int, group.by="integrated_celltype", label=TRUE, reduction = "tsne" )

Differentiate between ISCs and EBs

This is something that could not be possible in Hung et al., (2020), but maybe now, with more cells, can be done. So we took a closer look at the ISC/EB group.

# extract ISC/EB cluster from all cells
alldata.list <- SplitObject(alldata.int, split.by = "integrated_celltype")
isc_cluster <- alldata.list[["intestinal stem cell / enteroblast"]]
# re-cluster independently
DefaultAssay(isc_cluster) <- "CCA"
isc_cluster <- FindNeighbors(isc_cluster, dims = 1:30)
isc_cluster <- FindClusters(isc_cluster, resolution = 1, algorithm = 1)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1309
## Number of edges: 61492
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6613
## Number of communities: 7
## Elapsed time: 0 seconds
DefaultAssay(isc_cluster) <- "RNA"
Idents(isc_cluster) <- isc_cluster@meta.data$CCA_snn_res.1
table(Idents(isc_cluster) )
## 
##   0   1   2   3   4   5   6 
## 251 240 189 171 163 154 141
# evaluate marker expression in clusters
isc <- c("Dl", "Smvt", "sna", "polo", "stf", "cnn")
eb <- c("klu", "E(spl)m3-HLH", "E(spl)malpha-BFM", "E(spl)mbeta-HLH")
cluster.averages <- AverageExpression(isc_cluster, return.seurat=TRUE, add.ident="sample")
DoHeatmap(cluster.averages, features=isc, size=2)

DoHeatmap(cluster.averages, features=eb, size=2)

With this, we can now separate ISCs and EBs:

# discriminate ISCs/EBs based on expression of ISC/EB markers
clusters <- isc_cluster@meta.data$CCA_snn_res.1
celltype <- case_when(
  clusters %in% c(1,5,6) ~ "enteroblast",
  clusters %in% c(0,2,3,4) ~ "intestinal stem cell",
  TRUE ~ as.character(clusters))
# apply and check marker expression
isc_cluster@meta.data$specific_celltype <- celltype
Idents(isc_cluster) <- isc_cluster@meta.data$specific_celltype
isc.eb_averages <- AverageExpression(isc_cluster, return.seurat=TRUE, add.ident="sample")
DoHeatmap(isc.eb_averages, features=isc, size=2)

DoHeatmap(isc.eb_averages, features=eb, size=2)

# check cell distribution in UMAP
DimPlot(isc_cluster, reduction = "umap", group.by = "specific_celltype")

# include this distinction in the full dataset
isc_celltype <- isc_cluster@meta.data[["specific_celltype"]]
names(isc_celltype) <- colnames(x = isc_cluster)
int_celltype <- alldata.int@meta.data[["integrated_celltype"]]
names(int_celltype) <- colnames(x = alldata.int)
data <- c(isc_celltype,int_celltype)
alldata.int <- AddMetaData(
  object = alldata.int,
  metadata = data,
  col.name = 'specific_celltype'
)

6 Plot and save results

Final dimensionality reduction plots

Plots of the final integrated, annotated dataset:

DimPlot(alldata.int, reduction = "umap", group.by = "specific_celltype")

DimPlot(alldata.int, reduction = "tsne", group.by = "specific_celltype")

DefaultAssay(alldata.int) <- "RNA"

Save dataset.

as.data.frame.table(table((alldata.int)$specific_celltype)) %>%
  dplyr::rename(celltype=Var1, ncells=Freq) %>%
  knitr::kable()
celltype ncells
antimicrobial peptide-producing cell 355
cardia 2222
crop 4925
differentiating enterocyte 391
enteroblast 535
enterocyte of anterior midgut epithelium 5530
enterocyte of posterior midgut epithelium 2033
enterocyte-like 1379
enteroendocrine cell 917
intestinal stem cell 774
male accessory gland 114
malpighian tubule 58
midgut enterocyte 1378
midgut large flat cell 938
midgut-hindgut hybrid zone 341
muscle cell 577
pylorus 269
unkown 251
saveRDS(alldata.int, file.path(outpath, "gut_int_annot.rds"))

Extract marker tables

We select the cell types that we are interested and, using the FindMarkers function, we find genes differentially expressed in each cell type. We will use these generated gene lists later for pseudotime analysis.

alldata.list <- SplitObject(alldata.int, split.by = "specific_celltype")

data <- alldat.int <- merge(
  alldata.list[["enterocyte of anterior midgut epithelium"]],
  c(alldata.list[["enterocyte of posterior midgut epithelium"]],
    alldata.list[["intestinal stem cell"]],
    alldata.list[["enteroblast"]],
    alldata.list[["enteroendocrine cell"]]),
  add.cell.ids = c("enterocyte of anterior adult midgut epithelium",
                   "enterocyte of posterior adult midgut epithelium",
                   "intestinal stem cell",
                   "enteroblast",
                   "enteroendocrine cell")
  )
Idents(data) <- data@meta.data$specific_celltype
if (!file.exists( file.path(outpath, "markers-mast") )) {
  dir.create(file.path(outpath, "markers-mast"))
}
if (!file.exists( file.path(outpath, "markers-mast", "enterocyte_anterior_mast.csv") )) {
  markersmast <- FindMarkers(
    data, ident.1="enterocyte of anterior midgut epithelium", test.use="MAST") %>%
    as.data.frame() %>%
    tibble::rownames_to_column("gene") %>%
    arrange(p_val_adj)
  readr::write_csv(markersmast,
                   file.path(outpath, "markers-mast",
                             "enterocyte_anterior_mast.csv"))
  markersmast <- FindMarkers(
    data, ident.1="enterocyte of posterior midgut epithelium", test.use="MAST") %>%
    as.data.frame() %>%
    tibble::rownames_to_column("gene") %>%
    arrange(p_val_adj)
  readr::write_csv(markersmast,
                   file.path(outpath, "markers-mast",
                             "enterocyte_posterior_mast.csv"))
  markersmast <- FindMarkers(
    data, ident.1="intestinal stem cell", test.use="MAST") %>%
    as.data.frame() %>%
    tibble::rownames_to_column("gene") %>%
    arrange(p_val_adj)
  readr::write_csv(markersmast,
                   file.path(outpath, "markers-mast",
                             "stem_cell_mast.csv"))
  markersmast <- FindMarkers(
    data, ident.1="enteroblast", test.use="MAST") %>%
    as.data.frame() %>%
    tibble::rownames_to_column("gene") %>%
    arrange(p_val_adj)
  readr::write_csv(markersmast,
                   file.path(outpath, "markers-mast",
                             "enteroblast_mast.csv"))
  markersmast <- FindMarkers(
    data, ident.1="enteroendocrine cell", test.use="MAST") %>%
    as.data.frame() %>%
    tibble::rownames_to_column("gene") %>%
    arrange(p_val_adj)
  readr::write_csv(markersmast,
                   file.path(outpath, "markers-mast",
                             "enteroendocrine_mast.csv"))
}

unlink( c(file.path(outpath, 'alldata-int-cache1.rds'),
          file.path(outpath, 'alldata-int-cache2.rds')) )